#!pip install pandas
#!pip install numpy
#!pip install matplotlib
#!pip install sklearn
#!pip install scipy
#!pip install yellowbrick
#!pip install more-itertools
#!pip install python-math
#!pip install DateTime
#!pip install sompy
#!pip install ipdb
#!pip install phik
#!pip install -U git+https://github.com/joaopfonseca/SOMPY.git
#!pip install plotly
#!pip install patsy
#!pip install prince
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sompy
import phik
import prince
import statsmodels.api as sm
import itertools
import plotly.express as pltexp
import matplotlib.cm as cm
import warnings
from patsy import dmatrices
from datetime import datetime
from yellowbrick.cluster import KElbowVisualizer
from math import ceil
from sompy.visualization.mapview import View2D
from sompy.visualization.bmuhits import BmuHitsView
from sompy.visualization.hitmap import HitMapView
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.impute import KNNImputer
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.cluster import AgglomerativeClustering, KMeans, Birch, MiniBatchKMeans, MeanShift, estimate_bandwidth, DBSCAN
from scipy.cluster.hierarchy import dendrogram
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors
from sklearn.mixture import GaussianMixture
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
# Silence library warnings to keep the notebook output readable
warnings.filterwarnings('ignore')
# Load the A2Z insurance customer dataset from a SAS binary file.
# NOTE: string columns (e.g. EducDeg) come back as *bytes*, not str.
data = pd.read_sas('a2z_insurance.sas7bdat')
# Preview the first rows
data.head()
| CustID | FirstPolYear | BirthYear | EducDeg | MonthSal | GeoLivArea | Children | CustMonVal | ClaimsRate | PremMotor | PremHousehold | PremHealth | PremLife | PremWork | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 1985.0 | 1982.0 | b'2 - High School' | 2177.0 | 1.0 | 1.0 | 380.97 | 0.39 | 375.85 | 79.45 | 146.36 | 47.01 | 16.89 |
| 1 | 2.0 | 1981.0 | 1995.0 | b'2 - High School' | 677.0 | 4.0 | 1.0 | -131.13 | 1.12 | 77.46 | 416.20 | 116.69 | 194.48 | 106.13 |
| 2 | 3.0 | 1991.0 | 1970.0 | b'1 - Basic' | 2277.0 | 3.0 | 0.0 | 504.67 | 0.28 | 206.15 | 224.50 | 124.58 | 86.35 | 99.02 |
| 3 | 4.0 | 1990.0 | 1981.0 | b'3 - BSc/MSc' | 1099.0 | 4.0 | 1.0 | -16.99 | 0.99 | 182.48 | 43.35 | 311.17 | 35.34 | 28.34 |
| 4 | 5.0 | 1986.0 | 1973.0 | b'3 - BSc/MSc' | 1763.0 | 4.0 | 1.0 | 35.23 | 0.90 | 338.62 | 47.80 | 182.59 | 18.78 | 41.45 |
# dataset data types (all numeric columns read as float64; EducDeg is object/bytes)
data.dtypes
CustID float64 FirstPolYear float64 BirthYear float64 EducDeg object MonthSal float64 GeoLivArea float64 Children float64 CustMonVal float64 ClaimsRate float64 PremMotor float64 PremHousehold float64 PremHealth float64 PremLife float64 PremWork float64 dtype: object
#if there are empty string values, replace them with NaN so they count as missing
data.replace("", np.nan, inplace=True)
# count of missing values per column
data.isna().sum()
CustID 0 FirstPolYear 30 BirthYear 17 EducDeg 17 MonthSal 36 GeoLivArea 1 Children 21 CustMonVal 0 ClaimsRate 0 PremMotor 34 PremHousehold 0 PremHealth 43 PremLife 104 PremWork 86 dtype: int64
# duplicated observations (none found — CustID is unique per customer)
data.duplicated().sum()
0
# descriptive statistics, transposed so each row is one variable
data.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| CustID | 10296.0 | 5148.500000 | 2972.343520 | 1.00 | 2574.75 | 5148.50 | 7722.2500 | 10296.00 |
| FirstPolYear | 10266.0 | 1991.062634 | 511.267913 | 1974.00 | 1980.00 | 1986.00 | 1992.0000 | 53784.00 |
| BirthYear | 10279.0 | 1968.007783 | 19.709476 | 1028.00 | 1953.00 | 1968.00 | 1983.0000 | 2001.00 |
| MonthSal | 10260.0 | 2506.667057 | 1157.449634 | 333.00 | 1706.00 | 2501.50 | 3290.2500 | 55215.00 |
| GeoLivArea | 10295.0 | 2.709859 | 1.266291 | 1.00 | 1.00 | 3.00 | 4.0000 | 4.00 |
| Children | 10275.0 | 0.706764 | 0.455268 | 0.00 | 0.00 | 1.00 | 1.0000 | 1.00 |
| CustMonVal | 10296.0 | 177.892605 | 1945.811505 | -165680.42 | -9.44 | 186.87 | 399.7775 | 11875.89 |
| ClaimsRate | 10296.0 | 0.742772 | 2.916964 | 0.00 | 0.39 | 0.72 | 0.9800 | 256.20 |
| PremMotor | 10262.0 | 300.470252 | 211.914997 | -4.11 | 190.59 | 298.61 | 408.3000 | 11604.42 |
| PremHousehold | 10296.0 | 210.431192 | 352.595984 | -75.00 | 49.45 | 132.80 | 290.0500 | 25048.80 |
| PremHealth | 10253.0 | 171.580833 | 296.405976 | -2.11 | 111.80 | 162.81 | 219.8200 | 28272.00 |
| PremLife | 10192.0 | 41.855782 | 47.480632 | -7.00 | 9.89 | 25.56 | 57.7900 | 398.30 |
| PremWork | 10210.0 | 41.277514 | 51.513572 | -12.00 | 10.67 | 25.67 | 56.7900 | 1988.70 |
# Work on a copy so the raw dataset stays untouched
df_data = data.copy()
# Cast the customer id to int and use it as the index
df_data = df_data.astype({'CustID': int}).set_index('CustID')
# Quick distribution overview: one histogram per variable
df_data.hist(bins=30, figsize=(15, 10))
plt.tight_layout()
plt.show()
# Box plots for GeoLivArea, to see if the variable is relevant or not
fig = plt.figure(figsize=(2,3))
fig.set_size_inches(20,10)
fig.subplots_adjust(hspace=0.3, wspace=0.3)
gs = fig.add_gridspec(nrows=3,ncols=3)
ax1 = fig.add_subplot(gs[0,0])
ax2 = fig.add_subplot(gs[0,1])
ax3 = fig.add_subplot(gs[0,2])
ax4 = fig.add_subplot(gs[1,0])
ax5 = fig.add_subplot(gs[1,1])
ax6 = fig.add_subplot(gs[1,2])
ax7 = fig.add_subplot(gs[2,0])
ax8 = fig.add_subplot(gs[2,1])
ax9 = fig.add_subplot(gs[2,2])
sns.boxplot(x="GeoLivArea", y="CustMonVal", data=df_data, ax=ax1, color='orange')
sns.boxplot(x="GeoLivArea", y="ClaimsRate", data=df_data, ax=ax2, color='orange')
sns.boxplot(x="GeoLivArea", y="PremMotor", data=df_data, ax=ax3, color='orange')
sns.boxplot(x="GeoLivArea", y="PremHousehold", data=df_data, ax=ax4, color='orange')
sns.boxplot(x="GeoLivArea", y="PremHealth", data=df_data, ax=ax5, color='orange')
sns.boxplot(x="GeoLivArea", y="PremLife", data=df_data, ax=ax6, color='orange')
sns.boxplot(x="GeoLivArea", y="PremWork", data=df_data, ax=ax7, color='orange')
sns.boxplot(x="GeoLivArea", y="MonthSal", data=df_data, ax=ax8, color='orange')
sns.boxplot(x="GeoLivArea", y="FirstPolYear", data=df_data, ax=ax9, color='orange')
<AxesSubplot:xlabel='GeoLivArea', ylabel='FirstPolYear'>
# Coherence check of BirthYear: flag customers born after 2016 or before 1926.
# (NaN compares False on both sides, so missing years are flagged 0, as before.)
birth = df_data['BirthYear']
df_data['BirthYear_Check'] = ((birth > 2016) | (birth < 1926)).astype(int)
df_data[df_data['BirthYear_Check'] == 1]
| FirstPolYear | BirthYear | EducDeg | MonthSal | GeoLivArea | Children | CustMonVal | ClaimsRate | PremMotor | PremHousehold | PremHealth | PremLife | PremWork | BirthYear_Check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| CustID | ||||||||||||||
| 7196 | 1993.0 | 1028.0 | b'3 - BSc/MSc' | 2830.0 | 4.0 | 0.0 | 146.02 | 0.77 | 428.97 | 192.8 | 108.91 | 1.89 | 23.67 | 1 |
# Coherence check of FirstPolYear: flag first-policy years outside 1926-2016.
# (NaN compares False on both sides, so missing years are flagged 0, as before.)
fpy = df_data['FirstPolYear']
df_data['FirstPolYear_Check'] = ((fpy > 2016) | (fpy < 1926)).astype(int)
df_data[df_data['FirstPolYear_Check'] == 1]
| FirstPolYear | BirthYear | EducDeg | MonthSal | GeoLivArea | Children | CustMonVal | ClaimsRate | PremMotor | PremHousehold | PremHealth | PremLife | PremWork | BirthYear_Check | FirstPolYear_Check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| CustID | |||||||||||||||
| 9295 | 53784.0 | 1948.0 | b'3 - BSc/MSc' | 3268.0 | 1.0 | 0.0 | 348.85 | 0.37 | 351.4 | 15.0 | 187.37 | 22.45 | 17.78 | 0 | 1 |
# Flag customers whose first policy year precedes their birth year
# (i.e. BirthYear strictly greater than FirstPolYear).
df_data['FirstPolYear_BirthYear_Check'] = (
    df_data['BirthYear'] > df_data['FirstPolYear']
).astype(int)
# How many customers are affected
sum(df_data['FirstPolYear_BirthYear_Check'])
1997
# Coherence check for salary: the minimum legal working age in Portugal is 16,
# so anyone younger than 16 in 2016 should not have a positive monthly salary.
age_in_2016 = 2016 - df_data['BirthYear']
df_data['Salary_Check'] = ((age_in_2016 < 16) & (df_data['MonthSal'] > 0)).astype(int)
df_data[df_data['Salary_Check'] == 1]
| FirstPolYear | BirthYear | EducDeg | MonthSal | GeoLivArea | Children | CustMonVal | ClaimsRate | PremMotor | PremHousehold | PremHealth | PremLife | PremWork | BirthYear_Check | FirstPolYear_Check | FirstPolYear_BirthYear_Check | Salary_Check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| CustID | |||||||||||||||||
| 660 | 1991.0 | 2001.0 | b'1 - Basic' | 1284.0 | 4.0 | 1.0 | 243.50 | 0.81 | 46.23 | 1079.65 | 72.68 | 252.27 | -10.00 | 0 | 0 | 1 | 1 |
| 690 | 1997.0 | 2001.0 | b'1 - Basic' | 1180.0 | 3.0 | 1.0 | 436.10 | 0.48 | 54.90 | 406.20 | 65.79 | 112.02 | 256.05 | 0 | 0 | 1 | 1 |
| 866 | 1979.0 | 2001.0 | b'1 - Basic' | 562.0 | 1.0 | 1.0 | 1113.78 | 0.11 | 40.34 | 847.95 | 94.24 | 178.70 | 112.13 | 0 | 0 | 1 | 1 |
| 1437 | 1989.0 | 2001.0 | b'1 - Basic' | 823.0 | 4.0 | 0.0 | 281.28 | 0.60 | 129.58 | 177.80 | 58.01 | 176.70 | 218.15 | 0 | 0 | 1 | 1 |
| 2580 | 1986.0 | 2001.0 | b'1 - Basic' | 1006.0 | 4.0 | 0.0 | 267.95 | 0.67 | 48.23 | 396.75 | 291.50 | 127.58 | 24.45 | 0 | 0 | 1 | 1 |
| 3147 | 1994.0 | 2001.0 | b'1 - Basic' | 372.0 | 4.0 | 1.0 | 488.76 | 0.36 | 60.68 | 270.60 | 177.59 | 182.48 | 107.13 | 0 | 0 | 1 | 1 |
| 4466 | 1978.0 | 2001.0 | b'1 - Basic' | 958.0 | 4.0 | 0.0 | -204.59 | 1.11 | 28.56 | 1359.15 | 68.79 | 38.23 | 179.59 | 0 | 0 | 1 | 1 |
| 4484 | 1983.0 | 2001.0 | b'1 - Basic' | 1241.0 | 3.0 | 1.0 | 909.17 | 0.13 | 43.34 | 589.55 | 142.47 | 121.80 | 179.48 | 0 | 0 | 1 | 1 |
| 5348 | 1980.0 | 2001.0 | b'1 - Basic' | 1119.0 | 3.0 | 1.0 | 796.71 | 0.23 | 148.14 | 628.45 | 221.82 | 45.12 | 30.23 | 0 | 0 | 1 | 1 |
| 5608 | 1974.0 | 2001.0 | b'1 - Basic' | 1131.0 | 3.0 | 0.0 | -25.00 | 1.00 | 43.23 | 690.70 | 157.03 | 87.35 | 142.25 | 0 | 0 | 1 | 1 |
| 8963 | 1978.0 | 2001.0 | b'1 - Basic' | 1117.0 | 4.0 | 0.0 | 1280.58 | 0.11 | 19.56 | 1109.10 | 55.90 | 230.82 | 53.90 | 0 | 0 | 1 | 1 |
| 9907 | 1976.0 | 2001.0 | b'1 - Basic' | 1422.0 | 1.0 | 1.0 | 467.31 | 0.37 | 69.68 | 231.15 | 201.26 | 191.48 | 82.35 | 0 | 0 | 1 | 1 |
# Coherence check for education: finishing a BSc/MSc at age 16 or younger is
# implausible.
# BUG FIX: EducDeg is read from SAS as *bytes* (e.g. b'3 - BSc/MSc'), so the
# original comparison against the str '3 - BSc/MSc' could never be True and
# the check was vacuously 0 for every row. Compare against the bytes literal.
df_data['Education_Check'] = df_data.apply(
    lambda x: 1 if (2016 - x['BirthYear'] <= 16 and x['EducDeg'] == b'3 - BSc/MSc') else 0,
    axis=1)
df_data[df_data['Education_Check'] == 1]
| FirstPolYear | BirthYear | EducDeg | MonthSal | GeoLivArea | Children | CustMonVal | ClaimsRate | PremMotor | PremHousehold | PremHealth | PremLife | PremWork | BirthYear_Check | FirstPolYear_Check | FirstPolYear_BirthYear_Check | Salary_Check | Education_Check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| CustID |
# Add a column with the annual salary (14 salary payments per year in Portugal)
df_data['AnualSal'] = df_data['MonthSal']*14

#Coherence check for Premiums (can't spend more money than they earn)
def summ(num, *args):
    """Return the sum of ``num`` and any extra numeric arguments.

    Kept for backward compatibility; the manual accumulation loop is
    replaced with the built-in ``sum``. NaN still propagates through ``+``.
    """
    return num + sum(args)

# Vectorized version of the row-wise premium total. skipna=False keeps the
# NaN-propagating semantics of plain addition: a row with any missing premium
# sums to NaN, and NaN > AnualSal is False, so it is never flagged — exactly
# as the original apply/summ combination behaved.
premium_cols = ['PremMotor', 'PremHousehold', 'PremLife', 'PremHealth', 'PremWork']
df_data['Premiums_Check'] = (
    df_data[premium_cols].sum(axis=1, skipna=False) > df_data['AnualSal']
).astype(int)
df_data[df_data['Premiums_Check'] == 1]
| FirstPolYear | BirthYear | EducDeg | MonthSal | GeoLivArea | Children | CustMonVal | ClaimsRate | PremMotor | PremHousehold | PremHealth | PremLife | PremWork | BirthYear_Check | FirstPolYear_Check | FirstPolYear_BirthYear_Check | Salary_Check | Education_Check | AnualSal | Premiums_Check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| CustID | ||||||||||||||||||||
| 9150 | 1985.0 | 1994.0 | b'2 - High School' | 987.0 | 3.0 | 1.0 | 804.05 | 0.97 | 26.34 | 829.05 | 28272.0 | 65.68 | 138.25 | 0 | 0 | 1 | 0 | 0 | 13818.0 | 1 |
# Dimensions before dropping the incoherent observations
df_data.shape
(10296, 20)
# Drop the three incoherent observations found above (one row per failed check)
for flag in ('Premiums_Check', 'FirstPolYear_Check', 'BirthYear_Check'):
    df_data = df_data[df_data[flag] != 1]
df_data.columns
Index(['FirstPolYear', 'BirthYear', 'EducDeg', 'MonthSal', 'GeoLivArea',
'Children', 'CustMonVal', 'ClaimsRate', 'PremMotor', 'PremHousehold',
'PremHealth', 'PremLife', 'PremWork', 'BirthYear_Check',
'FirstPolYear_Check', 'FirstPolYear_BirthYear_Check', 'Salary_Check',
'Education_Check', 'AnualSal', 'Premiums_Check'],
dtype='object')
# 1997 observations have BirthYear later than FirstPolYear, so BirthYear is
# unreliable: drop it together with all auxiliary coherence-check columns.
df_data = df_data.drop(columns=[
    'BirthYear', 'BirthYear_Check', 'FirstPolYear_Check',
    'FirstPolYear_BirthYear_Check', 'Salary_Check', 'Education_Check',
    'Premiums_Check', 'AnualSal',
])
df_data.shape
(10293, 12)
# Numeric variables used for the correlation / pairwise analysis below
cols_to_check = ['MonthSal', 'CustMonVal', 'ClaimsRate', 'PremMotor', 'PremHousehold','PremHealth', 'PremLife', 'PremWork']

def heatmap_spearman(df):
    """Plot the lower triangle of a pre-computed correlation matrix as a heatmap."""
    # Hide the redundant upper triangle (the matrix is symmetric)
    hide_upper = np.zeros_like(df)
    hide_upper[np.triu_indices_from(hide_upper)] = True
    with sns.axes_style("white"):
        fig, axis = plt.subplots(figsize=(10, 10))
        fig.patch.set_visible(False)
        axis.patch.set_visible(False)
        sns.heatmap(df, mask=hide_upper, square=True, cmap=plt.cm.Blues)
        plt.title('Spearman Correlation Matrix')
        plt.show()

heatmap_spearman(df_data.corr(method='spearman').abs())
locator: <matplotlib.colorbar._ColorbarAutoLocator object at 0x000001D708F1A910> Using auto colorbar locator <matplotlib.colorbar._ColorbarAutoLocator object at 0x000001D708F1A910> on colorbar Setting pcolormesh
def heatmap_phik(df):
    """Plot the lower triangle of a phik correlation matrix as a heatmap."""
    # Boolean mask over the (symmetric) upper triangle
    upper_mask = np.triu(np.ones_like(df, dtype=bool))
    with sns.axes_style("white"):
        fig, axis = plt.subplots(figsize=(10, 10))
        for element in (fig, axis):
            element.patch.set_visible(False)
        sns.heatmap(df, mask=upper_mask, square=True, cmap=plt.cm.Blues)
        plt.title('Phik Correlation Matrix')
        plt.show()

heatmap_phik(df_data.phik_matrix().abs())
interval columns not set, guessing: ['FirstPolYear', 'MonthSal', 'GeoLivArea', 'Children', 'CustMonVal', 'ClaimsRate', 'PremMotor', 'PremHousehold', 'PremHealth', 'PremLife', 'PremWork']
locator: <matplotlib.colorbar._ColorbarAutoLocator object at 0x000001D708ECFA00> Using auto colorbar locator <matplotlib.colorbar._ColorbarAutoLocator object at 0x000001D708ECFA00> on colorbar Setting pcolormesh
# Pairwise scatter plots (with histograms on the diagonal) for the numeric
# variables; upper and lower triangles show the same scatter for readability.
grid = sns.PairGrid(data=df_data, vars=cols_to_check, height=5)
grid.map_upper(plt.scatter, color='orange')
grid.map_diag(plt.hist, bins=10, color='silver')
grid.map_lower(plt.scatter, color='orange')
top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved 
top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved 
top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved 
top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved 
top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved 
top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved 
top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved 
top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved top of axes not in the figure, so title not moved
# Convert EducDeg to a numeric education level: the SAS bytes value
# (e.g. b'2 - High School') is stringified and the leading digit extracted.
# Fixes: raw string for the regex (avoids the invalid-escape-sequence
# warning) and expand=False so .str.extract returns a Series instead of a
# one-column DataFrame being assigned into the column.
df_data['EducDeg'] = df_data['EducDeg'].astype(str)
df_data['EducDeg'] = df_data['EducDeg'].str.extract(r'(\d+)', expand=False)
df_data['EducDeg'] = pd.to_numeric(df_data['EducDeg'])
# count of missing values per column after the coherence-based row drops
df_data.isna().sum()
FirstPolYear 30 EducDeg 17 MonthSal 36 GeoLivArea 1 Children 21 CustMonVal 0 ClaimsRate 0 PremMotor 34 PremHousehold 0 PremHealth 43 PremLife 104 PremWork 86 dtype: int64
# Remaining columns after dropping BirthYear and the check columns
df_data.columns
Index(['FirstPolYear', 'EducDeg', 'MonthSal', 'GeoLivArea', 'Children',
'CustMonVal', 'ClaimsRate', 'PremMotor', 'PremHousehold', 'PremHealth',
'PremLife', 'PremWork'],
dtype='object')
# OLS check: can FirstPolYear be explained by CustMonVal, EducDeg and Children?
# (used to decide between imputing and dropping its missing values)
# NOTE(review): the original header said "Education, Area" but the formula uses
# Children, not GeoLivArea. NA_action='drop' removes rows with NaN in any term.
y,x = dmatrices('FirstPolYear ~ CustMonVal + EducDeg + Children', data = df_data, NA_action='drop', return_type='dataframe')
mod = sm.OLS(y,x)
res = mod.fit()
print(res.summary())
# R-squared ~ 0: nothing predicts FirstPolYear, so drop its missing observations
OLS Regression Results
==============================================================================
Dep. Variable: FirstPolYear R-squared: 0.000
Model: OLS Adj. R-squared: -0.000
Method: Least Squares F-statistic: 0.6459
Date: Fri, 07 Jan 2022 Prob (F-statistic): 0.585
Time: 21:22:41 Log-Likelihood: -33896.
No. Observations: 10248 AIC: 6.780e+04
Df Residuals: 10244 BIC: 6.783e+04
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 1986.2495 0.237 8394.769 0.000 1985.786 1986.713
CustMonVal -1.206e-05 3.35e-05 -0.360 0.719 -7.77e-05 5.36e-05
EducDeg -0.0410 0.082 -0.499 0.618 -0.202 0.120
Children -0.1794 0.144 -1.250 0.211 -0.461 0.102
==============================================================================
Omnibus: 6471.747 Durbin-Watson: 2.010
Prob(Omnibus): 0.000 Jarque-Bera (JB): 577.286
Skew: -0.023 Prob(JB): 4.40e-126
Kurtosis: 1.838 Cond. No. 7.67e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.67e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
# Delete observations where living area and first policy year are null
df_data.dropna(subset = ['GeoLivArea'], inplace = True) # 1 observation dropped
df_data.dropna(subset = ['FirstPolYear'], inplace = True) # 29 observations dropped
# Replace null values in the Premium variables with 0
# (a missing premium is treated as "no policy of that type", not unknown — TODO confirm)
df_data['PremMotor'].fillna(0, inplace = True)
df_data['PremLife'].fillna(0, inplace = True)
df_data['PremHealth'].fillna(0, inplace = True)
df_data['PremWork'].fillna(0, inplace = True)
# pairwise Pearson correlations (informs the KNN-imputation feature choice below)
df_data.corr()
| FirstPolYear | EducDeg | MonthSal | GeoLivArea | Children | CustMonVal | ClaimsRate | PremMotor | PremHousehold | PremHealth | PremLife | PremWork | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| FirstPolYear | 1.000000 | -0.005482 | 0.023551 | -0.004275 | -0.012591 | -0.003507 | 0.003924 | -0.005718 | -0.009861 | 0.016356 | 0.010388 | 0.016744 |
| EducDeg | -0.005482 | 1.000000 | 0.150063 | -0.013773 | 0.002971 | -0.018080 | 0.005352 | 0.304337 | -0.284653 | -0.040032 | -0.429745 | -0.387029 |
| MonthSal | 0.023551 | 0.150063 | 1.000000 | 0.015045 | -0.393307 | -0.003212 | -0.003510 | 0.135669 | -0.133017 | 0.029169 | -0.195979 | -0.173589 |
| GeoLivArea | -0.004275 | -0.013773 | 0.015045 | 1.000000 | -0.020952 | -0.005876 | 0.007546 | 0.003331 | 0.010702 | 0.004968 | 0.012583 | 0.002995 |
| Children | -0.012591 | 0.002971 | -0.393307 | -0.020952 | 1.000000 | -0.000941 | -0.002228 | 0.154650 | -0.063367 | -0.201269 | -0.115685 | -0.086139 |
| CustMonVal | -0.003507 | -0.018080 | -0.003212 | -0.005876 | -0.000941 | 1.000000 | -0.992630 | 0.034011 | 0.032647 | -0.005584 | 0.010422 | 0.020164 |
| ClaimsRate | 0.003924 | 0.005352 | -0.003510 | 0.007546 | -0.002228 | -0.992630 | 1.000000 | -0.006265 | -0.008017 | 0.015770 | 0.001503 | -0.001069 |
| PremMotor | -0.005718 | 0.304337 | 0.135669 | 0.003331 | 0.154650 | 0.034011 | -0.006265 | 1.000000 | -0.273431 | -0.180741 | -0.406478 | -0.346461 |
| PremHousehold | -0.009861 | -0.284653 | -0.133017 | 0.010702 | -0.063367 | 0.032647 | -0.008017 | -0.273431 | 1.000000 | 0.025833 | 0.263584 | 0.239931 |
| PremHealth | 0.016356 | -0.040032 | 0.029169 | 0.004968 | -0.201269 | -0.005584 | 0.015770 | -0.180741 | 0.025833 | 1.000000 | 0.066444 | 0.180025 |
| PremLife | 0.010388 | -0.429745 | -0.195979 | 0.012583 | -0.115685 | 0.010422 | 0.001503 | -0.406478 | 0.263584 | 0.066444 | 1.000000 | 0.345508 |
| PremWork | 0.016744 | -0.387029 | -0.173589 | 0.002995 | -0.086139 | 0.020164 | -0.001069 | -0.346461 | 0.239931 | 0.180025 | 0.345508 | 1.000000 |
# use KNN (10 nearest neighbours, uniform weights) to impute the remaining
# missing values on Children, MonthSal, EducDeg
metrics_to_impute = ['Children', 'MonthSal', 'EducDeg']
imputer = KNNImputer(n_neighbors=10, weights="uniform")
df_data[metrics_to_impute] = imputer.fit_transform(df_data[metrics_to_impute])
# verify: no missing values left anywhere
df_data.isna().sum()
FirstPolYear 0 EducDeg 0 MonthSal 0 GeoLivArea 0 Children 0 CustMonVal 0 ClaimsRate 0 PremMotor 0 PremHousehold 0 PremHealth 0 PremLife 0 PremWork 0 dtype: int64
# Univariate outlier trimming on CustMonVal: inspect histogram + boxplot, then
# filter in successive passes, tightening the thresholds after re-inspection.
# The `| isnull()` clause is defensive — it keeps NaN rows out of the drop.
fig = pltexp.histogram(df_data, x=df_data['CustMonVal'], color_discrete_sequence=['orange'], template='plotly_white')
fig.show()
fig = pltexp.box(df_data, y=df_data['CustMonVal'], color_discrete_sequence=['forestgreen'], template='plotly_white')
fig.show()
df_data=df_data[(df_data['CustMonVal']>=-5000) | (df_data['CustMonVal'].isnull())] #5 observation dropped
df_data=df_data[(df_data['CustMonVal']>=-2000) | (df_data['CustMonVal'].isnull())] #2 rows dropped
df_data = df_data[(df_data['CustMonVal']<=2000) | (df_data['CustMonVal'].isnull())] #12 rows dropped
df_data = df_data[(df_data['CustMonVal']<=1320) | (df_data['CustMonVal'].isnull())] #11 rows dropped
# Same routine for each remaining numeric variable: histogram + boxplot to
# eyeball the distribution, then manual threshold filters chosen visually
# (thresholds are hard-coded from inspecting the plots above each filter).
fig = pltexp.histogram(df_data, x=df_data['MonthSal'], color_discrete_sequence=['orange'], template='plotly_white')
fig.show()
fig = pltexp.box(df_data, y=df_data['MonthSal'], color_discrete_sequence=['forestgreen'], template='plotly_white')
fig.show()
df_data=df_data[(df_data['MonthSal']<=20000) | (df_data['MonthSal'].isnull())] #2 rows dropped
fig = pltexp.histogram(df_data, x=df_data['ClaimsRate'], color_discrete_sequence=['orange'], template='plotly_white')
fig.show()
fig = pltexp.box(df_data, y=df_data['ClaimsRate'], color_discrete_sequence=['forestgreen'], template='plotly_white')
fig.show()
df_data=df_data[(df_data['ClaimsRate']<=4) | (df_data['ClaimsRate'].isnull())]
fig = pltexp.histogram(df_data, x=df_data['PremMotor'], color_discrete_sequence=['orange'], template='plotly_white')
fig.show()
fig = pltexp.box(df_data, y=df_data['PremMotor'], color_discrete_sequence=['forestgreen'], template='plotly_white')
fig.show()
df_data=df_data[(df_data['PremMotor']<=2000) | (df_data['PremMotor'].isnull())]
fig = pltexp.histogram(df_data, x=df_data['PremHousehold'], color_discrete_sequence=['orange'], template='plotly_white')
fig.show()
fig = pltexp.box(df_data, y=df_data['PremHousehold'], color_discrete_sequence=['forestgreen'], template='plotly_white')
fig.show()
df_data=df_data[(df_data['PremHousehold']<=2500) | (df_data['PremHousehold'].isnull())]
df_data=df_data[(df_data['PremHousehold']<=1250) | (df_data['PremHousehold'].isnull())]
fig = pltexp.histogram(df_data, x=df_data['PremHealth'], color_discrete_sequence=['orange'], template='plotly_white')
fig.show()
fig = pltexp.box(df_data, y=df_data['PremHealth'], color_discrete_sequence=['forestgreen'], template='plotly_white')
fig.show()
df_data=df_data[(df_data['PremHealth']<=395) | (df_data['PremHealth'].isnull())]
fig = pltexp.histogram(df_data, x=df_data['PremLife'], color_discrete_sequence=['orange'], template='plotly_white')
fig.show()
fig = pltexp.box(df_data, y=df_data['PremLife'], color_discrete_sequence=['forestgreen'], template='plotly_white')
fig.show()
df_data=df_data[(df_data['PremLife']<=330) | (df_data['PremLife'].isnull())]
df_data=df_data[(df_data['PremLife']<=285) | (df_data['PremLife'].isnull())]
fig = pltexp.histogram(df_data, x=df_data['PremWork'], color_discrete_sequence=['orange'], template='plotly_white')
fig.show()
fig = pltexp.box(df_data, y=df_data['PremWork'], color_discrete_sequence=['forestgreen'], template='plotly_white')
fig.show()
df_data=df_data[(df_data['PremWork']<=350) | (df_data['PremWork'].isnull())]
df_data=df_data[(df_data['PremWork']<=300) | (df_data['PremWork'].isnull())]
# row count after all univariate trimming
df_data.shape
(10154, 12)
#Define non_metric and metric variables
# NOTE(review): GeoLivArea is a coded region (values 1-4) but is listed as
# metric, so it enters distance-based models as a number — confirm intended.
non_metric_features = ['Children', 'EducDeg', 'FirstPolYear']
metric_features = ['MonthSal', 'GeoLivArea', 'CustMonVal', 'ClaimsRate', 'PremMotor', 'PremHousehold', 'PremHealth', 'PremLife', 'PremWork']
data_metric = df_data[metric_features].copy()
data_metric
| MonthSal | GeoLivArea | CustMonVal | ClaimsRate | PremMotor | PremHousehold | PremHealth | PremLife | PremWork | |
|---|---|---|---|---|---|---|---|---|---|
| CustID | |||||||||
| 1 | 2177.0 | 1.0 | 380.97 | 0.39 | 375.85 | 79.45 | 146.36 | 47.01 | 16.89 |
| 2 | 677.0 | 4.0 | -131.13 | 1.12 | 77.46 | 416.20 | 116.69 | 194.48 | 106.13 |
| 3 | 2277.0 | 3.0 | 504.67 | 0.28 | 206.15 | 224.50 | 124.58 | 86.35 | 99.02 |
| 4 | 1099.0 | 4.0 | -16.99 | 0.99 | 182.48 | 43.35 | 311.17 | 35.34 | 28.34 |
| 5 | 1763.0 | 4.0 | 35.23 | 0.90 | 338.62 | 47.80 | 182.59 | 18.78 | 41.45 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10291 | 626.0 | 3.0 | 176.26 | 0.85 | 6.89 | 878.50 | 103.13 | 113.02 | 201.26 |
| 10292 | 3188.0 | 2.0 | -0.11 | 0.96 | 393.74 | 49.45 | 173.81 | 9.78 | 14.78 |
| 10294 | 2918.0 | 1.0 | 524.10 | 0.21 | 403.63 | 132.80 | 142.25 | 12.67 | 4.89 |
| 10295 | 1971.0 | 2.0 | 250.05 | 0.65 | 188.59 | 211.15 | 198.37 | 63.90 | 112.91 |
| 10296 | 2815.0 | 1.0 | 463.75 | 0.27 | 414.08 | 94.45 | 141.25 | 6.89 | 12.89 |
10154 rows × 9 columns
# Multivariate outlier detection, round 1: Isolation Forest flagging the most
# anomalous 1% of rows; fit_predict returns -1 for outliers.
iso = IsolationForest(contamination=0.01)
iso_df = iso.fit_predict(data_metric)
mask_out =iso_df == -1
outliers_if=data_metric[mask_out]
# remove only from the working copy; df_data itself is pruned later (after PCA)
data_metric.drop(outliers_if.index,inplace=True)
print('Percentage of data removed after outliers:',
      np.round(1 - np.round(data_metric.shape[0] / df_data.shape[0], 4), 5)*100, "%")
Percentage of data removed after outliers: 1.0 %
# Round 2: EllipticEnvelope (robust MCD covariance), again trimming ~1% —
# applied after the Isolation Forest removals, so the two rounds compound.
ee = EllipticEnvelope(contamination=0.01)
ee_df = ee.fit_predict(data_metric)
mask_out =ee_df == -1
outliers_mcd=data_metric[mask_out]
data_metric.drop(outliers_mcd.index,inplace=True)
print('Percentage of data removed after outliers:',
      np.round(1 - np.round(data_metric.shape[0] / df_data.shape[0], 5), 5)*100, "%")
Percentage of data removed after outliers: 1.999 %
print("Total Outliers with MCD & ISO: " + str(len(outliers_if) + len(outliers_mcd)))
Total Outliers with MCD & ISO: 203
df_data.shape
(10154, 12)
df_pca = df_data.copy()
# Use PCA to reduce dimensionality of data
# NOTE(review): this exploratory fit runs on all 12 raw (unstandardized)
# columns, so the leading components are dominated by the large-scale
# features; the 2-component fit further below uses metric_features only —
# confirm the inconsistency is intentional.
pca = PCA()
pca_feat = pca.fit_transform(df_pca)
# Output PCA table: eigenvalue, successive eigenvalue differences, and the
# (cumulative) explained-variance ratio per component, 1-indexed.
pd.DataFrame(
    {"Eigenvalue": pca.explained_variance_,
     "Difference": np.insert(np.diff(pca.explained_variance_), 0, 0),
     "Proportion": pca.explained_variance_ratio_,
     "Cumulative": np.cumsum(pca.explained_variance_ratio_)},
    index=range(1, pca.n_components_ + 1)
)
| Eigenvalue | Difference | Proportion | Cumulative | |
|---|---|---|---|---|
| 1 | 961415.725612 | 0.000000 | 8.750521e-01 | 0.875052 |
| 2 | 74325.978317 | -887089.747295 | 6.764930e-02 | 0.942701 |
| 3 | 46555.478310 | -27770.500008 | 4.237342e-02 | 0.985075 |
| 4 | 11866.506646 | -34688.971664 | 1.080054e-02 | 0.995875 |
| 5 | 3016.456733 | -8850.049913 | 2.745489e-03 | 0.998621 |
| 6 | 1220.636109 | -1795.820624 | 1.110987e-03 | 0.999732 |
| 7 | 248.929504 | -971.706605 | 2.265682e-04 | 0.999958 |
| 8 | 43.549569 | -205.379936 | 3.963753e-05 | 0.999998 |
| 9 | 1.601725 | -41.947844 | 1.457842e-06 | 0.999999 |
| 10 | 0.444091 | -1.157633 | 4.041987e-07 | 1.000000 |
| 11 | 0.133215 | -0.310876 | 1.212486e-07 | 1.000000 |
| 12 | 0.007380 | -0.125835 | 6.717009e-09 | 1.000000 |
# figure and axes: scree plot (left) and explained-variance curves (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
# draw plots
ax1.plot(pca.explained_variance_, marker=".", markersize=12)
ax2.plot(pca.explained_variance_ratio_, marker=".", markersize=12, label="Proportion")
ax2.plot(np.cumsum(pca.explained_variance_ratio_), marker=".", markersize=12, linestyle="--", label="Cumulative")
# customizations
ax2.legend()
ax1.set_title("Scree Plot", fontsize=14)
ax2.set_title("Variance Explained", fontsize=14)
ax1.set_ylabel("Eigenvalue")
ax2.set_ylabel("Proportion")
ax1.set_xlabel("Components")
ax2.set_xlabel("Components")
# ticks are 0-based positions relabelled with 1-based component numbers
ax1.set_xticks(range(0, pca.n_components_, 2))
ax1.set_xticklabels(range(1, pca.n_components_ + 1, 2))
ax2.set_xticks(range(0, pca.n_components_, 2))
ax2.set_xticklabels(range(1, pca.n_components_ + 1, 2))
plt.show()
# Perform PCA again with the number of principal components you want to retain
pca = PCA(n_components = 2)
pca_feat = pca.fit_transform(df_pca[metric_features])
pca_feat_names = [f"PC{i}" for i in range(pca.n_components_)]
pca_df = pd.DataFrame(pca_feat, index=df_pca.index, columns=pca_feat_names) # remember index=df_pca.index
pca_df
| PC0 | PC1 | |
|---|---|---|
| CustID | ||
| 1 | -323.409962 | 22.383615 |
| 2 | -1842.236531 | -183.228899 |
| 3 | -240.055636 | 249.669411 |
| 4 | -1396.760164 | -319.902778 |
| 5 | -730.487728 | -278.216864 |
| ... | ... | ... |
| 10291 | -1925.049101 | 339.053258 |
| 10292 | 693.997100 | -245.284628 |
| 10294 | 412.273091 | 194.948218 |
| 10295 | -541.043368 | 32.521495 |
| 10296 | 312.795246 | 118.127467 |
10154 rows × 2 columns
# Flag the observations removed by IsolationForest / EllipticEnvelope so they
# can be highlighted on the PCA projection.
outliers = list(outliers_mcd.index.tolist() + outliers_if.index.tolist())
_outlier_ids = set(outliers)  # set for O(1) membership tests instead of O(n) list scans

def is_out(x):
    """Return 'Outlier' if CustID x was flagged by either detector, else 'Not Outlier'."""
    return 'Outlier' if x in _outlier_ids else 'Not Outlier'

# BUG FIX: the original assigned pca_df.reset_index()['CustID'].apply(is_out) —
# a Series carrying a fresh RangeIndex — onto pca_df, which is indexed by
# CustID. pandas aligns assignments on index labels, so most rows received NaN
# (describe() reported OUTLIER count 10013 instead of 10154) and the rest were
# matched to the wrong customers. Mapping over pca_df's own index avoids any
# alignment entirely.
pca_df['OUTLIER'] = pca_df.index.map(is_out)
pca_df.describe(include='all')
| PC0 | PC1 | OUTLIER | |
|---|---|---|---|
| count | 1.015400e+04 | 1.015400e+04 | 10013 |
| unique | NaN | NaN | 2 |
| top | NaN | NaN | Not Outlier |
| freq | NaN | NaN | 9811 |
| mean | 3.250219e-13 | 1.707430e-14 | NaN |
| std | 9.805180e+02 | 2.726278e+02 | NaN |
| min | -2.219965e+03 | -5.679483e+02 | NaN |
| 25% | -7.794496e+02 | -1.996681e+02 | NaN |
| 50% | 8.413757e+00 | -1.630927e+01 | NaN |
| 75% | 7.862059e+02 | 1.216324e+02 | NaN |
| max | 2.507595e+03 | 1.393815e+03 | NaN |
fig = plt.figure(figsize=(10,10))
sns.scatterplot(data=pca_df, x="PC0", y="PC1", hue='OUTLIER', style="OUTLIER", palette="deep")
<AxesSubplot:xlabel='PC0', ylabel='PC1'>
df_data.drop(outliers_if.index,inplace=True)
df_data.drop(outliers_mcd.index,inplace=True)
df_data.shape
(9951, 12)
df_data['FirstPolYear'] = df_data['FirstPolYear'].astype(int)
df_data['EducDeg'] = df_data['EducDeg'].astype(int)
df_data['MonthSal'] = df_data['MonthSal'].astype(int)
df_data['GeoLivArea'] = df_data['GeoLivArea'].astype(int)
df_data['Children'] = df_data['Children'].astype(int)
df_data.columns
Index(['FirstPolYear', 'EducDeg', 'MonthSal', 'GeoLivArea', 'Children',
'CustMonVal', 'ClaimsRate', 'PremMotor', 'PremHousehold', 'PremHealth',
'PremLife', 'PremWork'],
dtype='object')
# --- Feature engineering -------------------------------------------------
# Tenure in years; assumes the data snapshot is from 2016 — TODO confirm.
df_data['Years_As_Client'] = 2016 - df_data['FirstPolYear']
# Annual salary; the x14 presumably reflects 14 yearly payments — confirm.
df_data['YearSal'] = df_data['MonthSal'] * 14
# Sum of the five premium columns, counting positive premiums only:
# [df_data > 0] masks non-positive cells to NaN before the row-wise sum.
df_data['Total_Premiums'] = df_data.loc[:,['PremMotor','PremHousehold','PremHealth','PremLife','PremWork']][df_data > 0].sum(1)
# DELETE ROWS WHERE TOTAL_PREMIUMS EQUALS 0
df_data = df_data[df_data['Total_Premiums'] != 0] # 12 rows dropped
# Row-wise sum of the negative premium values; iloc cols 7:12 are the five
# Prem* columns at this point (positional — fragile if columns are reordered).
df_data['Negative'] = df_data.iloc[:,7:12][df_data<0].sum(1)
# Binary flag: customer had at least one negative (reversed/cancelled) premium
df_data['Cancelled'] = np.where(df_data['Negative']<0, 1, 0)
df_data['Negative'] = abs(df_data['Negative'])
# drop raw columns now superseded by the engineered ones
df_data = df_data.drop(columns=['MonthSal','FirstPolYear', 'Negative'])
df_data
| EducDeg | GeoLivArea | Children | CustMonVal | ClaimsRate | PremMotor | PremHousehold | PremHealth | PremLife | PremWork | Years_As_Client | YearSal | Total_Premiums | Cancelled | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| CustID | ||||||||||||||
| 1 | 2 | 1 | 1 | 380.97 | 0.39 | 375.85 | 79.45 | 146.36 | 47.01 | 16.89 | 31 | 30478 | 665.56 | 0 |
| 2 | 2 | 4 | 1 | -131.13 | 1.12 | 77.46 | 416.20 | 116.69 | 194.48 | 106.13 | 35 | 9478 | 910.96 | 0 |
| 3 | 1 | 3 | 0 | 504.67 | 0.28 | 206.15 | 224.50 | 124.58 | 86.35 | 99.02 | 25 | 31878 | 740.60 | 0 |
| 4 | 3 | 4 | 1 | -16.99 | 0.99 | 182.48 | 43.35 | 311.17 | 35.34 | 28.34 | 26 | 15386 | 600.68 | 0 |
| 5 | 3 | 4 | 1 | 35.23 | 0.90 | 338.62 | 47.80 | 182.59 | 18.78 | 41.45 | 30 | 24682 | 629.24 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10291 | 1 | 3 | 1 | 176.26 | 0.85 | 6.89 | 878.50 | 103.13 | 113.02 | 201.26 | 22 | 8764 | 1302.80 | 0 |
| 10292 | 4 | 2 | 0 | -0.11 | 0.96 | 393.74 | 49.45 | 173.81 | 9.78 | 14.78 | 32 | 44632 | 641.56 | 0 |
| 10294 | 3 | 1 | 1 | 524.10 | 0.21 | 403.63 | 132.80 | 142.25 | 12.67 | 4.89 | 22 | 40852 | 696.24 | 0 |
| 10295 | 1 | 2 | 1 | 250.05 | 0.65 | 188.59 | 211.15 | 198.37 | 63.90 | 112.91 | 35 | 27594 | 774.92 | 0 |
| 10296 | 4 | 1 | 1 | 463.75 | 0.27 | 414.08 | 94.45 | 141.25 | 6.89 | 12.89 | 26 | 39410 | 669.56 | 0 |
9951 rows × 14 columns
# The engineered Total_Premiums gets the same inspect-and-trim treatment
fig = pltexp.histogram(df_data, x=df_data['Total_Premiums'], color_discrete_sequence=['orange'], template='plotly_white')
fig.show()
fig = pltexp.box(df_data, y=df_data['Total_Premiums'], color_discrete_sequence=['forestgreen'], template='plotly_white')
fig.show()
df_data = df_data[(df_data['Total_Premiums']>=490) | (df_data['Total_Premiums'].isnull())]
df_data = df_data[(df_data['Total_Premiums']<=1520) | (df_data['Total_Premiums'].isnull())]
df_data.shape
(9940, 14)
# Correlation heatmaps (helper functions defined elsewhere in the notebook):
# Spearman for monotonic association, phik for mixed-type association.
heatmap_spearman(df_data.corr(method='spearman').abs())
heatmap_phik(df_data.phik_matrix().abs())
locator: <matplotlib.colorbar._ColorbarAutoLocator object at 0x000001D70C2AB6A0> Using auto colorbar locator <matplotlib.colorbar._ColorbarAutoLocator object at 0x000001D70C2AB6A0> on colorbar Setting pcolormesh
interval columns not set, guessing: ['EducDeg', 'GeoLivArea', 'Children', 'CustMonVal', 'ClaimsRate', 'PremMotor', 'PremHousehold', 'PremHealth', 'PremLife', 'PremWork', 'Years_As_Client', 'YearSal', 'Total_Premiums', 'Cancelled']
locator: <matplotlib.colorbar._ColorbarAutoLocator object at 0x000001D70C47EE20> Using auto colorbar locator <matplotlib.colorbar._ColorbarAutoLocator object at 0x000001D70C47EE20> on colorbar Setting pcolormesh
standard_df = pd.DataFrame(StandardScaler().fit(df_data).transform(df_data),index=df_data.index, columns=df_data.columns)
def plotKElbow(cluster, data, metric):
    """Fit a yellowbrick KElbowVisualizer over k = 2..11 for the given
    estimator and scoring metric ('distortion', 'silhouette' or
    'calinski_harabasz') and display the elbow plot.

    Parameters:
        cluster: an unfitted sklearn clustering estimator (e.g. KMeans)
        data: the (already standardized) feature matrix
        metric (str): the KElbowVisualizer scoring metric
    """
    # FIX: the original passed `locate_elbowbool=False`, a typo for the
    # `locate_elbow` keyword, so the elbow-annotation option was never applied.
    visualizer = KElbowVisualizer(cluster, k=(2, 12), metric=metric, timings=False, locate_elbow=False)
    visualizer.fit(data)
    visualizer.show()
# Elbow analysis for K-Means under two different scoring metrics
cluster = KMeans(random_state=42)
plotKElbow(cluster, standard_df, 'calinski_harabasz')
plotKElbow(cluster, standard_df, 'distortion')
def kmeans_cluster(df, n_clust):
    """Partition df into n_clust clusters with k-means++ initialisation
    (15 restarts, fixed seed for reproducibility) and return the label array.
    """
    model = KMeans(n_clusters=n_clust, init='k-means++', n_init=15, random_state=1)
    model.fit(df)
    return model.labels_
standard_df['labels'] = kmeans_cluster(standard_df, 3)
# Preparing the data: predict the k-means labels from the features, to measure
# how separable the clusters are and to rank feature importance
X = standard_df.drop(columns = 'labels')
y = standard_df.labels
# Splitting the data (80/20 hold-out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Fitting a random forest (original comment said "decision tree")
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
print("It is estimated that in average, we are able to predict {0:.2f}% of the customers correctly".format(rf.score(X_test, y_test)*100))
It is estimated that in average, we are able to predict 95.12% of the customers correctly
# Rank the features by random-forest importance and plot them
importances = rf.feature_importances_
indices = np.argsort(importances)  # NOTE(review): computed but never used below
zippy = pd.DataFrame(zip(importances))
zippy['Features'] = X_train.columns
# long format for seaborn: one row per (feature, importance) pair
tidy = zippy.melt(id_vars='Features').rename(columns=str.title)
tidy.sort_values(['Value'], ascending = False, inplace = True)
plt.figure(figsize=(15,8))
sns.barplot(y='Features', x='Value', hue='Variable', data=tidy)
<AxesSubplot:xlabel='Value', ylabel='Features'>
standard_df.columns
Index(['EducDeg', 'GeoLivArea', 'Children', 'CustMonVal', 'ClaimsRate',
'PremMotor', 'PremHousehold', 'PremHealth', 'PremLife', 'PremWork',
'Years_As_Client', 'YearSal', 'Total_Premiums', 'Cancelled', 'labels'],
dtype='object')
selected_df=standard_df.loc[:,["PremWork", "PremMotor","PremHealth","PremLife", "Total_Premiums", "YearSal", "Years_As_Client", "CustMonVal", "Cancelled"]]
We implemented the following clustering algorithms:
def get_r2_hc(df, link_method, max_nclus, min_nclus=1, dist="euclidean"):
    """This function computes the R2 for a set of cluster solutions given by the application of a hierarchical method.
    The R2 is a measure of the homogenity of a cluster solution. It is based on SSt = SSw + SSb and R2 = SSb/SSt.

    Parameters:
    df (DataFrame): Dataset to apply clustering
    link_method (str): either "ward", "complete", "average", "single"
    max_nclus (int): maximum number of clusters to compare the methods
    min_nclus (int): minimum number of clusters to compare the methods. Defaults to 1.
    dist (str): distance to use to compute the clustering solution. Must be a valid distance. Defaults to "euclidean".

    Returns:
    ndarray: R2 values for the range of cluster solutions
    """
    def get_ss(df):
        # per-variable sum of squared deviations, summed over variables
        ss = np.sum(df.var() * (df.count() - 1))
        return ss

    sst = get_ss(df)  # total sum of squares
    r2 = []  # R2 metric for each cluster solution
    for i in range(min_nclus, max_nclus + 1):
        cluster = AgglomerativeClustering(n_clusters=i, affinity=dist, linkage=link_method)
        hclabels = cluster.fit_predict(df)
        # BUG FIX: pd.Series(hclabels) carries a fresh RangeIndex; when df has a
        # non-default index (here: CustID), concat(axis=1) aligns on labels and
        # fills NaN, corrupting the groupby/SSW computation. Reusing df's index
        # keeps labels attached to the right rows.
        df_concat = pd.concat((df, pd.Series(hclabels, name='labels', index=df.index)), axis=1)
        ssw_labels = df_concat.groupby(by='labels').apply(get_ss)  # SSW per cluster
        ssb = sst - np.sum(ssw_labels)  # SST = SSW + SSB
        r2.append(ssb / sst)
    return np.array(r2)
def cluster_profiles(df, label_columns, figsize, compar_titles=None):
    """
    Pass df with labels columns of one or multiple clustering labels.
    Then specify this label columns to perform the cluster profile according to them.

    For each label column, draws one row of two panels: a parallel-coordinates
    plot of the cluster centroids (means) on the left, and a bar chart of the
    cluster sizes on the right.
    """
    if compar_titles == None:
        compar_titles = [""]*len(label_columns)
    sns.set()
    # one row of (profile, sizes) axes per label column; squeeze=False keeps axes 2-D
    fig, axes = plt.subplots(nrows=len(label_columns), ncols=2, figsize=figsize, squeeze=False)
    for ax, label, titl in zip(axes, label_columns, compar_titles):
        # Filtering df: drop the other label columns so they don't enter the profile
        drop_cols = [i for i in label_columns if i!=label]
        dfax = df.drop(drop_cols, axis=1)
        # Getting the cluster centroids and counts
        centroids = dfax.groupby(by=label, as_index=False).mean()
        counts = dfax.groupby(by=label, as_index=False).count().iloc[:,[0,1]]
        counts.columns = [label, "counts"]
        # Setting Data
        pd.plotting.parallel_coordinates(centroids, label, color=sns.color_palette(), ax=ax[0])
        sns.barplot(x=label, y="counts", data=counts, ax=ax[1])
        #Setting Layout
        handles, _ = ax[0].get_legend_handles_labels()
        cluster_labels = ["Cluster {}".format(i) for i in range(len(handles))]
        #ax[0].annotate(text=titl, xy=(0.95,1.1), xycoords='axes fraction', fontsize=13, fontweight = 'heavy')
        ax[0].legend(handles, cluster_labels) # Adaptable to number of clusters
        # zero line: the data is standardized, so 0 is the overall mean
        ax[0].axhline(color="black", linestyle="--")
        ax[0].set_title("Cluster Means - {} Clusters".format(len(handles)), fontsize=13)
        ax[0].set_xticklabels(ax[0].get_xticklabels(), rotation=-20)
        ax[1].set_xticklabels(cluster_labels)
        ax[1].set_xlabel("")
        ax[1].set_ylabel("Absolute Frequency")
        ax[1].set_title("Cluster Sizes - {} Clusters".format(len(handles)), fontsize=13)
    plt.subplots_adjust(hspace=0.4, top=0.90)
    plt.suptitle("Cluster Simple Profilling", fontsize=23)
    plt.show()
def get_ss(df):
    """Computes the sum of squares for all variables given a dataset.

    For each column this is var * (n - 1), i.e. the sum of squared
    deviations from that column's mean; the per-column values are summed.
    """
    per_variable = df.var() * (df.count() - 1)
    return np.sum(per_variable)
def gmm_cluster(data, nclust):
    """Fit a Gaussian Mixture Model with nclust components (k-means
    initialisation, fixed seed) and return the hard cluster assignments."""
    model = GaussianMixture(n_components=nclust, init_params='kmeans', random_state=0)
    model.fit(data)
    return model.predict(data)
def avg_silhouette(min_range, max_range, df, cluster):
    """For each n_clusters in [min_range, max_range), fit `cluster` on df,
    print and collect the average silhouette score, and draw the per-sample
    silhouette plot.

    Parameters:
        min_range (int): first cluster count to try (1 is skipped — the
            silhouette is undefined for a single cluster)
        max_range (int): one past the last cluster count to try
        df: feature matrix to cluster
        cluster: an sklearn estimator exposing either `n_clusters` (e.g.
            KMeans) or `n_components` (e.g. GaussianMixture)

    Returns:
        list: the average silhouette score for each evaluated cluster count.
        (FIX: the original built this list but never returned it.)
    """
    avg_silhouette = []
    for nclus in range(min_range, max_range):
        # the silhouette coefficient is undefined for a single cluster
        # (original comment here contained pasted HTML/markdown fragments)
        if nclus == 1:
            continue
        # one figure per cluster count
        fig = plt.figure(figsize=(10, 5))
        # KMeans-style estimators expose n_clusters; mixtures use n_components
        if "n_clusters" in cluster.get_params():
            param = "n_clusters"
        else:
            param = "n_components"
        # Get the cluster labels by applying the given clustering algorithm
        clustering = cluster.set_params(**{param: nclus})
        cluster_labels = clustering.fit_predict(df)
        # The silhouette_score gives the average value for all the samples,
        # a perspective on the density and separation of the formed clusters
        silhouette_avg = silhouette_score(df, cluster_labels)
        avg_silhouette.append(silhouette_avg)
        print(f"For n_clusters = {nclus}, the average silhouette_score is : {silhouette_avg}")
        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(df, cluster_labels)
        y_lower = 10
        for i in range(nclus):
            # Aggregate the silhouette scores for samples of cluster i, sorted
            ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()
            # Get y_upper to demarcate silhouette y range size
            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i
            # Filling the silhouette
            color = cm.nipy_spectral(float(i) / nclus)
            plt.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)
            # Label the silhouette plot with its cluster number at the middle
            plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            # Compute the new y_lower for the next plot
            y_lower = y_upper + 10  # 10 blank rows between clusters
        plt.title("The silhouette plot for the various clusters.")
        plt.xlabel("The silhouette coefficient values")
        plt.ylabel("Cluster label")
        # Vertical line at the average silhouette score of all values
        plt.axvline(x=silhouette_avg, color="red", linestyle="--")
        # The silhouette coefficient can range from -1 to 1
        xmin, xmax = np.round(sample_silhouette_values.min() - 0.1, 2), np.round(sample_silhouette_values.max() + 0.1, 2)
        plt.xlim([xmin, xmax])
        # The (nclus+1)*10 inserts blank space between the per-cluster
        # silhouettes to demarcate them clearly
        plt.ylim([0, len(df) + (nclus + 1) * 10])
        plt.yticks([])  # Clear the y-axis labels / ticks
        plt.xticks(np.arange(xmin, xmax, 0.1))
    return avg_silhouette
# Final 3-cluster K-Means on the selected features, then profile the clusters
kmeans_df = selected_df.copy()
kmeans_df['labels'] = kmeans_cluster(kmeans_df, 3)
cluster_profiles(kmeans_df, ["labels"], figsize=(22, 8), compar_titles=["K-Means - All Data"])
# This som implementation does not have a random seed parameter
# We're going to set it up ourselves
np.random.seed(42)
# NOTE(review): `sm` shadows the `statsmodels.api as sm` alias imported at the
# top of the notebook; any later statsmodels call through `sm` would break.
sm = sompy.SOMFactory().build(
    selected_df.values,
    mapsize=(10, 10),  # 10x10 = 100 SOM units
    initialization='random',
    neighborhood='gaussian',
    training='batch',
    lattice='hexa',
    component_names=selected_df.columns
)
# two-phase training: 100 rough epochs (shrinking radius 4->1) + 100 fine-tune epochs
sm.train(n_job=4, verbose='info', train_rough_len=100, train_finetune_len=100)
Training... random_initialization took: 0.000000 seconds Rough training... radius_ini: 4.000000 , radius_final: 1.000000, trainlen: 100 epoch: 1 ---> elapsed time: 0.029000, quantization error: 3.328756 epoch: 2 ---> elapsed time: 0.029000, quantization error: 2.807953 epoch: 3 ---> elapsed time: 0.030000, quantization error: 2.729498 epoch: 4 ---> elapsed time: 0.032000, quantization error: 2.692212 epoch: 5 ---> elapsed time: 0.031000, quantization error: 2.673920 epoch: 6 ---> elapsed time: 0.031000, quantization error: 2.657357 epoch: 7 ---> elapsed time: 0.030000, quantization error: 2.649784 epoch: 8 ---> elapsed time: 0.031000, quantization error: 2.646208 epoch: 9 ---> elapsed time: 0.030000, quantization error: 2.643262 epoch: 10 ---> elapsed time: 0.029000, quantization error: 2.640452 epoch: 11 ---> elapsed time: 0.030000, quantization error: 2.637641 epoch: 12 ---> elapsed time: 0.031000, quantization error: 2.634793 epoch: 13 ---> elapsed time: 0.033000, quantization error: 2.631891 epoch: 14 ---> elapsed time: 0.031000, quantization error: 2.628932 epoch: 15 ---> elapsed time: 0.030000, quantization error: 2.625917 epoch: 16 ---> elapsed time: 0.033000, quantization error: 2.622843 epoch: 17 ---> elapsed time: 0.032000, quantization error: 2.619716 epoch: 18 ---> elapsed time: 0.030000, quantization error: 2.616523 epoch: 19 ---> elapsed time: 0.033000, quantization error: 2.613283 epoch: 20 ---> elapsed time: 0.032000, quantization error: 2.609974 epoch: 21 ---> elapsed time: 0.031000, quantization error: 2.606577 epoch: 22 ---> elapsed time: 0.031000, quantization error: 2.603096 epoch: 23 ---> elapsed time: 0.031000, quantization error: 2.599535 epoch: 24 ---> elapsed time: 0.032000, quantization error: 2.595916 epoch: 25 ---> elapsed time: 0.029000, quantization error: 2.592225 epoch: 26 ---> elapsed time: 0.030000, quantization error: 2.588460 epoch: 27 ---> elapsed time: 0.034000, quantization error: 2.584622 epoch: 28 ---> elapsed time: 
0.031000, quantization error: 2.580693 epoch: 29 ---> elapsed time: 0.030000, quantization error: 2.576681 epoch: 30 ---> elapsed time: 0.031000, quantization error: 2.572599 epoch: 31 ---> elapsed time: 0.030000, quantization error: 2.568428 epoch: 32 ---> elapsed time: 0.032000, quantization error: 2.564176 epoch: 33 ---> elapsed time: 0.036000, quantization error: 2.559829 epoch: 34 ---> elapsed time: 0.034000, quantization error: 2.555368 epoch: 35 ---> elapsed time: 0.031000, quantization error: 2.550768 epoch: 36 ---> elapsed time: 0.035000, quantization error: 2.546116 epoch: 37 ---> elapsed time: 0.037000, quantization error: 2.541385 epoch: 38 ---> elapsed time: 0.037000, quantization error: 2.536539 epoch: 39 ---> elapsed time: 0.034000, quantization error: 2.531562 epoch: 40 ---> elapsed time: 0.032000, quantization error: 2.526486 epoch: 41 ---> elapsed time: 0.030000, quantization error: 2.521327 epoch: 42 ---> elapsed time: 0.031000, quantization error: 2.516022 epoch: 43 ---> elapsed time: 0.038000, quantization error: 2.510601 epoch: 44 ---> elapsed time: 0.031000, quantization error: 2.505045 epoch: 45 ---> elapsed time: 0.030000, quantization error: 2.499332 epoch: 46 ---> elapsed time: 0.031000, quantization error: 2.493471 epoch: 47 ---> elapsed time: 0.030000, quantization error: 2.487558 epoch: 48 ---> elapsed time: 0.031000, quantization error: 2.481532 epoch: 49 ---> elapsed time: 0.030000, quantization error: 2.475361 epoch: 50 ---> elapsed time: 0.030000, quantization error: 2.468969 epoch: 51 ---> elapsed time: 0.032000, quantization error: 2.462424 epoch: 52 ---> elapsed time: 0.030000, quantization error: 2.455682 epoch: 53 ---> elapsed time: 0.032000, quantization error: 2.448689 epoch: 54 ---> elapsed time: 0.029000, quantization error: 2.441475 epoch: 55 ---> elapsed time: 0.029000, quantization error: 2.433973 epoch: 56 ---> elapsed time: 0.030000, quantization error: 2.426087 epoch: 57 ---> elapsed time: 0.029000, quantization 
error: 2.417752 epoch: 58 ---> elapsed time: 0.028000, quantization error: 2.408939 epoch: 59 ---> elapsed time: 0.029000, quantization error: 2.399916 epoch: 60 ---> elapsed time: 0.029000, quantization error: 2.390895 epoch: 61 ---> elapsed time: 0.030000, quantization error: 2.381592 epoch: 62 ---> elapsed time: 0.029000, quantization error: 2.371494 epoch: 63 ---> elapsed time: 0.029000, quantization error: 2.360386 epoch: 64 ---> elapsed time: 0.029000, quantization error: 2.349137 epoch: 65 ---> elapsed time: 0.029000, quantization error: 2.338529 epoch: 66 ---> elapsed time: 0.031000, quantization error: 2.328114 epoch: 67 ---> elapsed time: 0.030000, quantization error: 2.317227 epoch: 68 ---> elapsed time: 0.029000, quantization error: 2.305617 epoch: 69 ---> elapsed time: 0.033000, quantization error: 2.292828 epoch: 70 ---> elapsed time: 0.029000, quantization error: 2.279349 epoch: 71 ---> elapsed time: 0.030000, quantization error: 2.265561 epoch: 72 ---> elapsed time: 0.030000, quantization error: 2.249351 epoch: 73 ---> elapsed time: 0.030000, quantization error: 2.235623 epoch: 74 ---> elapsed time: 0.031000, quantization error: 2.221473 epoch: 75 ---> elapsed time: 0.030000, quantization error: 2.207082 epoch: 76 ---> elapsed time: 0.029000, quantization error: 2.193940 epoch: 77 ---> elapsed time: 0.028000, quantization error: 2.180715 epoch: 78 ---> elapsed time: 0.028000, quantization error: 2.167171 epoch: 79 ---> elapsed time: 0.031000, quantization error: 2.153409 epoch: 80 ---> elapsed time: 0.030000, quantization error: 2.139505 epoch: 81 ---> elapsed time: 0.030000, quantization error: 2.125373 epoch: 82 ---> elapsed time: 0.028000, quantization error: 2.110921 epoch: 83 ---> elapsed time: 0.030000, quantization error: 2.096243 epoch: 84 ---> elapsed time: 0.029000, quantization error: 2.081279 epoch: 85 ---> elapsed time: 0.029000, quantization error: 2.065884 epoch: 86 ---> elapsed time: 0.031000, quantization error: 2.050119 epoch: 87 
---> elapsed time: 0.029000, quantization error: 2.034052 epoch: 88 ---> elapsed time: 0.029000, quantization error: 2.017710 epoch: 89 ---> elapsed time: 0.030000, quantization error: 2.001165 epoch: 90 ---> elapsed time: 0.029000, quantization error: 1.984399 epoch: 91 ---> elapsed time: 0.032000, quantization error: 1.967289 epoch: 92 ---> elapsed time: 0.031000, quantization error: 1.949875 epoch: 93 ---> elapsed time: 0.030000, quantization error: 1.932232 epoch: 94 ---> elapsed time: 0.030000, quantization error: 1.913862 epoch: 95 ---> elapsed time: 0.029000, quantization error: 1.894767 epoch: 96 ---> elapsed time: 0.028000, quantization error: 1.874759 epoch: 97 ---> elapsed time: 0.029000, quantization error: 1.853802 epoch: 98 ---> elapsed time: 0.030000, quantization error: 1.830701 epoch: 99 ---> elapsed time: 0.030000, quantization error: 1.803846 epoch: 100 ---> elapsed time: 0.029000, quantization error: 1.775591 Finetune training... radius_ini: 1.000000 , radius_final: 1.000000, trainlen: 100 epoch: 1 ---> elapsed time: 0.032000, quantization error: 1.749095 epoch: 2 ---> elapsed time: 0.029000, quantization error: 1.742448 epoch: 3 ---> elapsed time: 0.030000, quantization error: 1.738921 epoch: 4 ---> elapsed time: 0.031000, quantization error: 1.737030 epoch: 5 ---> elapsed time: 0.030000, quantization error: 1.735812 epoch: 6 ---> elapsed time: 0.030000, quantization error: 1.734870 epoch: 7 ---> elapsed time: 0.029000, quantization error: 1.733899 epoch: 8 ---> elapsed time: 0.029000, quantization error: 1.733095 epoch: 9 ---> elapsed time: 0.029000, quantization error: 1.732406 epoch: 10 ---> elapsed time: 0.029000, quantization error: 1.731589 epoch: 11 ---> elapsed time: 0.030000, quantization error: 1.730649 epoch: 12 ---> elapsed time: 0.031000, quantization error: 1.729728 epoch: 13 ---> elapsed time: 0.028000, quantization error: 1.729105 epoch: 14 ---> elapsed time: 0.031000, quantization error: 1.728699 epoch: 15 ---> elapsed time: 
0.030000, quantization error: 1.728444 epoch: 16 ---> elapsed time: 0.030000, quantization error: 1.728284 epoch: 17 ---> elapsed time: 0.029000, quantization error: 1.728275 epoch: 18 ---> elapsed time: 0.029000, quantization error: 1.728291 epoch: 19 ---> elapsed time: 0.030000, quantization error: 1.728313 epoch: 20 ---> elapsed time: 0.030000, quantization error: 1.728330 epoch: 21 ---> elapsed time: 0.031000, quantization error: 1.728320 epoch: 22 ---> elapsed time: 0.030000, quantization error: 1.728326 epoch: 23 ---> elapsed time: 0.031000, quantization error: 1.728310 epoch: 24 ---> elapsed time: 0.030000, quantization error: 1.728275 epoch: 25 ---> elapsed time: 0.032000, quantization error: 1.728191 epoch: 26 ---> elapsed time: 0.029000, quantization error: 1.728156 epoch: 27 ---> elapsed time: 0.031000, quantization error: 1.728091 epoch: 28 ---> elapsed time: 0.030000, quantization error: 1.728038 epoch: 29 ---> elapsed time: 0.030000, quantization error: 1.728031 epoch: 30 ---> elapsed time: 0.032000, quantization error: 1.728040 epoch: 31 ---> elapsed time: 0.030000, quantization error: 1.728018 epoch: 32 ---> elapsed time: 0.031000, quantization error: 1.728008 epoch: 33 ---> elapsed time: 0.030000, quantization error: 1.727997 epoch: 34 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 35 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 36 ---> elapsed time: 0.029000, quantization error: 1.727989 epoch: 37 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 38 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 39 ---> elapsed time: 0.029000, quantization error: 1.727989 epoch: 40 ---> elapsed time: 0.032000, quantization error: 1.727989 epoch: 41 ---> elapsed time: 0.029000, quantization error: 1.727989 epoch: 42 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 43 ---> elapsed time: 0.028000, quantization error: 1.727989 epoch: 44 ---> elapsed time: 0.030000, quantization 
error: 1.727989 epoch: 45 ---> elapsed time: 0.032000, quantization error: 1.727989 epoch: 46 ---> elapsed time: 0.031000, quantization error: 1.727989 epoch: 47 ---> elapsed time: 0.031000, quantization error: 1.727989 epoch: 48 ---> elapsed time: 0.029000, quantization error: 1.727989 epoch: 49 ---> elapsed time: 0.029000, quantization error: 1.727989 epoch: 50 ---> elapsed time: 0.029000, quantization error: 1.727989 epoch: 51 ---> elapsed time: 0.031000, quantization error: 1.727989 epoch: 52 ---> elapsed time: 0.031000, quantization error: 1.727989 epoch: 53 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 54 ---> elapsed time: 0.029000, quantization error: 1.727989 epoch: 55 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 56 ---> elapsed time: 0.029000, quantization error: 1.727989 epoch: 57 ---> elapsed time: 0.028000, quantization error: 1.727989 epoch: 58 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 59 ---> elapsed time: 0.029000, quantization error: 1.727989 epoch: 60 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 61 ---> elapsed time: 0.029000, quantization error: 1.727989 epoch: 62 ---> elapsed time: 0.029000, quantization error: 1.727989 epoch: 63 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 64 ---> elapsed time: 0.028000, quantization error: 1.727989 epoch: 65 ---> elapsed time: 0.029000, quantization error: 1.727989 epoch: 66 ---> elapsed time: 0.029000, quantization error: 1.727989 epoch: 67 ---> elapsed time: 0.029000, quantization error: 1.727989 epoch: 68 ---> elapsed time: 0.031000, quantization error: 1.727989 epoch: 69 ---> elapsed time: 0.028000, quantization error: 1.727989 epoch: 70 ---> elapsed time: 0.031000, quantization error: 1.727989 epoch: 71 ---> elapsed time: 0.028000, quantization error: 1.727989 epoch: 72 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 73 ---> elapsed time: 0.028000, quantization error: 1.727989 epoch: 74 
---> elapsed time: 0.029000, quantization error: 1.727989 epoch: 75 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 76 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 77 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 78 ---> elapsed time: 0.031000, quantization error: 1.727989 epoch: 79 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 80 ---> elapsed time: 0.032000, quantization error: 1.727989 epoch: 81 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 82 ---> elapsed time: 0.029000, quantization error: 1.727989 epoch: 83 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 84 ---> elapsed time: 0.029000, quantization error: 1.727989 epoch: 85 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 86 ---> elapsed time: 0.031000, quantization error: 1.727989 epoch: 87 ---> elapsed time: 0.032000, quantization error: 1.727989 epoch: 88 ---> elapsed time: 0.029000, quantization error: 1.727989 epoch: 89 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 90 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 91 ---> elapsed time: 0.031000, quantization error: 1.727989 epoch: 92 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 93 ---> elapsed time: 0.032000, quantization error: 1.727989 epoch: 94 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 95 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 96 ---> elapsed time: 0.032000, quantization error: 1.727989 epoch: 97 ---> elapsed time: 0.030000, quantization error: 1.727989 epoch: 98 ---> elapsed time: 0.039000, quantization error: 1.727989 epoch: 99 ---> elapsed time: 0.034000, quantization error: 1.727989 epoch: 100 ---> elapsed time: 0.033000, quantization error: 1.727989 Final quantization error: 1.727989 train took: 6.178000 seconds
# Component planes: one heatmap per feature, showing how that feature's
# codebook values vary across the trained SOM grid.
sns.set()
view2D = View2D(12, 12, "", text_size=10)
view2D.show(sm, col_sz=3, what='codebook')

plt.subplots_adjust(top=0.90)
plt.suptitle("Component Planes", fontsize=20)
plt.show()
# Elbow diagnostics for choosing k before clustering the SOM nodes
plotKElbow(cluster, selected_df, 'calinski_harabasz')
plotKElbow(cluster, selected_df, 'distortion')

# Cluster the SOM codebook vectors with K-Means (k=3 chosen from the elbows)
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=20, random_state=42)
nodeclus_labels = sm.cluster(kmeans)

hits = HitMapView(12, 12, "Clustering", text_size=10)
# hits.show(sm, anotate=True, onlyzeros=False, labelsize=7, cmap="Pastel1")
plt.show()
# Attach the K-Means cluster label to each SOM node vector
nodes = sm.get_node_vectors()
kmeans_nodes = pd.DataFrame(nodes, columns=selected_df.columns)
kmeans_nodes['labels'] = nodeclus_labels

# Obtaining SOM's BMUs labels
bmus_map_kmeans = sm.find_bmu(selected_df)[0]  # get bmus for each observation in df
kmeans_bmus = pd.DataFrame(
    np.concatenate((selected_df, np.expand_dims(bmus_map_kmeans, 1)), axis=1),
    index=selected_df.index, columns=np.append(selected_df.columns, "BMU")
)

# Get cluster labels for each observation via its BMU node
SOM_Kmeans_df = kmeans_bmus.merge(kmeans_nodes['labels'], 'left', left_on="BMU", right_index=True)

# FIX: the profile title previously read "K-Means + HC - Value", mislabeling this
# solution — it is SOM followed by K-Means on the node vectors, not hierarchical.
cluster_profiles(SOM_Kmeans_df.drop('BMU', axis=1), ["labels"], figsize=(23, 7), compar_titles=["SOM + K-Means - Value"])
# Obtain each observation's best-matching unit (BMU), then average the
# observations mapped to the same node — the HC run below works on node means.
bmus_map_hc = sm.find_bmu(selected_df)[0]
df_bmus_hc = pd.DataFrame(
    np.concatenate((selected_df, np.expand_dims(bmus_map_hc, 1)), axis=1),
    index=selected_df.index,
    columns=np.append(selected_df.columns, "BMU"),
)
df_bmus_hc = df_bmus_hc.groupby("BMU").mean()
# Compare hierarchical linkage methods by the R2 retained at each cluster count
hc_methods = ["ward", "complete", "average", "single"]
max_nclus = 10
r2_scores = {
    link: get_r2_hc(df=df_bmus_hc, link_method=link, max_nclus=max_nclus)
    for link in hc_methods
}
r2_hc_methods = pd.DataFrame(r2_scores, index=range(1, max_nclus + 1))

sns.set()
fig = plt.figure(figsize=(11, 5))
sns.lineplot(data=r2_hc_methods, linewidth=2.5, markers=["o"] * 4)

# Finalize the plot
fig.suptitle("R2 plot for various hierarchical methods", fontsize=21)
plt.gca().invert_xaxis()  # show higher cluster counts on the left
plt.legend(title="HC methods", title_fontsize=11)
plt.xticks(range(1, max_nclus + 1))
plt.xlabel("Number of clusters", fontsize=13)
plt.ylabel("R2 metric", fontsize=13)
plt.show()
# Fit the complete hierarchical tree (no cut): distance_threshold=0 with
# n_clusters=None makes sklearn record children_ and distances_ for every merge.
linkage = 'ward'
distance = 'euclidean'
hclust = AgglomerativeClustering(
    linkage=linkage, affinity=distance, distance_threshold=0, n_clusters=None
)
hclabel = hclust.fit_predict(df_bmus_hc)
# Adapted from:
# https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html#sphx-glr-auto-examples-cluster-plot-agglomerative-dendrogram-py
# Build the per-merge sample counts that scipy's dendrogram needs.
# hclust.children_[i] holds the two ids merged at step i into node n_samples + i.
n_samples = len(hclust.labels_)
counts = np.zeros(hclust.children_.shape[0])
for i, (left, right) in enumerate(hclust.children_):
    merged = 0
    for child in (left, right):
        # ids below n_samples are leaves (single observations); larger ids are
        # previously formed clusters whose sizes were already accumulated
        merged += 1 if child < n_samples else counts[child - n_samples]
    counts[i] = merged

# scipy linkage matrix: [child_a, child_b, merge distance, merged sample count]
linkage_matrix = np.column_stack(
    [hclust.children_, hclust.distances_, counts]
).astype(float)

# Plot the dendrogram with a candidate cut line
sns.set()
fig = plt.figure(figsize=(11, 5))
y_threshold = 7.5  # cut height, tuned by inspecting the merge distances
dendrogram(linkage_matrix, truncate_mode='level', p=5,
           color_threshold=y_threshold, above_threshold_color='k')
plt.hlines(y_threshold, 0, 1000, colors="r", linestyles="dashed")
plt.title(f'Hierarchical Clustering - {linkage.title()}\'s Dendrogram', fontsize=21)
plt.xlabel('Number of points in node (or index of point if no parenthesis)')
plt.ylabel(f'{distance.title()} Distance', fontsize=13)
plt.show()
# Cut the SOM-node hierarchy into 3 clusters (k chosen from the R2 plot/dendrogram)
hierclust = AgglomerativeClustering(n_clusters = 3, linkage='ward')
hc_labels = sm.cluster(hierclust)
# Check the nodes and respective clusters
nodes = sm.get_node_vectors()
hc_nodes = pd.DataFrame(nodes, columns=selected_df.columns)
hc_nodes['labels'] = hc_labels
# Get cluster labels for each observation
# NOTE(review): df_bmus_hc holds per-node means (it was grouped by BMU above), so
# this profiles node averages rather than individual observations; left_on="BMU"
# matches the "BMU" index level of df_bmus_hc against hc_nodes' node-id index.
som_hc_final = df_bmus_hc.merge(hc_nodes['labels'], 'left', left_on="BMU", right_index=True)
cluster_profiles(som_hc_final,["labels"], figsize=(23, 7),compar_titles=["SOM + HC - Value"])
# MiniBatchKMeans on the full dataset
# NOTE(review): MBKMeans_df appears unused below — kept for compatibility.
MBKMeans_df = selected_df.copy()

# Elbow diagnostics for choosing k
MiniBatch = MiniBatchKMeans(n_clusters=3, random_state=0)
plotKElbow(MiniBatch, selected_df, 'calinski_harabasz')
plotKElbow(MiniBatch, selected_df, 'distortion')

# Final fit with k=3, then profile the labelled copy
miniBatch = MiniBatchKMeans(n_clusters=3, init='k-means++', n_init=15, random_state=1)
miniBatch_labels = miniBatch.fit_predict(selected_df)
miniBatch_df = selected_df.copy()
miniBatch_df['labels'] = miniBatch_labels
cluster_profiles(miniBatch_df, ["labels"], figsize=(23, 7), compar_titles=["MiniBatchKMeans Clustering"])
# Gaussian Mixture: scan average silhouette for k = 2..5
# (the component count is presumably varied inside avg_silhouette — TODO confirm)
gmm_df = selected_df.copy()
gmm = GaussianMixture(init_params='kmeans', random_state=0)
avg_silhouette(2, 6, gmm_df, gmm)
For n_clusters = 2, the average silhouette_score is : 0.1880832065049186 For n_clusters = 3, the average silhouette_score is : 0.13958408157171795 For n_clusters = 4, the average silhouette_score is : 0.1602763255008471 For n_clusters = 5, the average silhouette_score is : 0.11402720490317174
# Fit the 3-component mixture and profile the resulting clusters
gmm_df['labels'] = gmm_cluster(gmm_df, 3)
cluster_profiles(gmm_df, ["labels"], figsize=(23, 7), compar_titles=["GMM - All Data"])
# Birch clustering on the full dataset, then profile the labelled copy
birch_df = selected_df.copy()
bclust = Birch(branching_factor=100, threshold=.5, n_clusters=3)
bclust.fit(birch_df)
labels = bclust.predict(birch_df)
birch_df['labels'] = labels
cluster_profiles(birch_df, ["labels"], figsize=(23, 7), compar_titles=["Birch Clustering"])
# Hierarchical clustering directly on the data:
# compare linkage methods by the R2 retained at each cluster count
hc_df = selected_df.copy()
hc_methods = ["ward", "complete", "average", "single"]
max_nclus = 10
r2_columns = [get_r2_hc(hc_df, link_method=link, max_nclus=max_nclus) for link in hc_methods]
r2_hc_methods = pd.DataFrame(
    np.vstack(r2_columns).T, index=range(1, max_nclus + 1), columns=hc_methods
)

sns.set()
fig = plt.figure(figsize=(11, 5))
sns.lineplot(data=r2_hc_methods, linewidth=2.5, markers=["o"] * 4)

# Finalize the plot
fig.suptitle("R2 plot for various hierarchical methods", fontsize=21)
plt.gca().invert_xaxis()  # show higher cluster counts on the left
plt.legend(title="HC methods", title_fontsize=11)
plt.xticks(range(1, max_nclus + 1))
plt.xlabel("Number of clusters", fontsize=13)
plt.ylabel("R2 metric", fontsize=13)
plt.show()
# Adapted from:
# https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html#sphx-glr-auto-examples-cluster-plot-agglomerative-dendrogram-py
#
# FIX: this dendrogram is meant to describe the hierarchy of hc_df (the full
# dataset), but the code reused `hclust` as fitted earlier on the SOM node means
# (df_bmus_hc), so the plot showed the wrong tree. Refit the complete tree on
# hc_df first (distance_threshold=0, n_clusters=None) so children_, distances_
# and the counts below belong to this data.
hclust = AgglomerativeClustering(
    linkage='ward', affinity='euclidean', distance_threshold=0, n_clusters=None
).fit(hc_df)

# create the counts of samples under each node (number of points being merged);
# hclust.children_[i] holds the two ids merged at step i into node n_samples + i
counts = np.zeros(hclust.children_.shape[0])
n_samples = len(hclust.labels_)
for i, merge in enumerate(hclust.children_):
    current_count = 0
    for child_idx in merge:
        if child_idx < n_samples:
            current_count += 1  # leaf node (single observation)
        else:
            current_count += counts[child_idx - n_samples]  # previously formed cluster
    counts[i] = current_count

# scipy linkage matrix: [child_a, child_b, merge distance, merged sample count]
linkage_matrix = np.column_stack(
    [hclust.children_, hclust.distances_, counts]
).astype(float)

# Plot the corresponding dendrogram
sns.set()
fig = plt.figure(figsize=(11, 5))
y_threshold = 8  # candidate cut height, tuned by inspecting the merge distances
dendrogram(linkage_matrix, truncate_mode='level', p=5, color_threshold=y_threshold, above_threshold_color='k')
plt.hlines(y_threshold, 0, 1000, colors="r", linestyles="dashed")
plt.title(f'Hierarchical Clustering - {linkage.title()}\'s Dendrogram', fontsize=21)
plt.xlabel('Number of points in node (or index of point if no parenthesis)')
plt.ylabel(f'{distance.title()} Distance', fontsize=13)
plt.show()
# Final cut: 3 clusters with Ward linkage on the full dataset
# (this fit returns labels directly; no distance_threshold is needed here)
linkage = 'ward'
distance = 'euclidean'
hclust = AgglomerativeClustering(linkage=linkage, affinity=distance, n_clusters=3)
hc_df['labels'] = hclust.fit_predict(hc_df)
cluster_profiles(hc_df, ["labels"], figsize=(23, 7), compar_titles=["HC - All Data"])
# DBSCAN: pick eps from the elbow of the sorted k-distance curve
dbscan_df = selected_df.copy()
neigh = NearestNeighbors(n_neighbors=20)
neigh.fit(dbscan_df)
knn_distances, _ = neigh.kneighbors(dbscan_df)

# Sort each point's distance to its 20th neighbour and plot the curve
plt.plot(np.sort(knn_distances[:, -1]))
plt.show()

# Cluster with eps read off the k-distance plot; min_samples matches n_neighbors
dbscan = DBSCAN(eps=1.6, min_samples=20, n_jobs=4)
dbscan_df['labels'] = dbscan.fit_predict(dbscan_df)
print("Number of estimated clusters : %d" % len(np.unique(dbscan_df['labels'])))
Number of estimated clusters : 3
# FIX: the title previously read "HC - All Data" (copy-paste) — this profiles DBSCAN.
cluster_profiles(dbscan_df, ["labels"], figsize=(23, 7), compar_titles=["DBSCAN - All Data"])
def plot_R2(df, column):
    """Return the R2 (SSB/SST) of a clustering solution.

    df     : feature DataFrame the solution was fitted on.
    column : cluster labels aligned with df's index.
    """
    sst = get_ss(df)  # total sum of squares
    labelled = pd.concat((df, pd.Series(column, name='label')), axis=1)
    ssw = labelled.groupby(by='label').apply(get_ss).sum()  # within-cluster SS
    # SST = SSW + SSB, so the explained fraction is (SST - SSW) / SST
    return (sst - ssw) / sst
# R2 of each clustering solution, computed from its labelled DataFrame
labelled_frames = {
    'Kmeans': kmeans_df,
    'Som + Kmeans': SOM_Kmeans_df,
    'Som + HC': som_hc_final,
    'MiniBatchKmeans': miniBatch_df,
    'GMM': gmm_df,
    'Birch': birch_df,
    'HC': hc_df,
    'DBSCAN': dbscan_df,
}
r2_by_algo = {name: plot_R2(frame, frame['labels']) for name, frame in labelled_frames.items()}

# Keep the individual bindings other cells may refer to (dict preserves order)
R2_Kmeans, R2_SOM_Kmeans, R2_SOM_HC, R2_MBKMeans, R2_GMM, R2_Birch, R2_HC, R2_DBSCAN = r2_by_algo.values()

# Long-format table sorted by R2, then a horizontal bar chart
r2_solutions = np.asarray(list(r2_by_algo.values()))
zippy = pd.DataFrame(zip(r2_solutions))
zippy['Algorithms'] = list(r2_by_algo.keys())
tidy = zippy.melt(id_vars='Algorithms').rename(columns=str.title)
tidy.sort_values(['Value'], ascending=False, inplace=True)
plt.figure(figsize=(10, 6))
sns.barplot(y='Algorithms', x='Value', data=tidy)
<AxesSubplot:xlabel='Value', ylabel='Algorithms'>
# Inspect the columns available in the working dataset
df_data.columns
Index(['EducDeg', 'GeoLivArea', 'Children', 'CustMonVal', 'ClaimsRate',
'PremMotor', 'PremHousehold', 'PremHealth', 'PremLife', 'PremWork',
'Years_As_Client', 'YearSal', 'Total_Premiums', 'Cancelled'],
dtype='object')
# Build the categorical frame for MCA
categorical = df_data[['EducDeg', 'GeoLivArea', 'Children']].copy()
categorical.reset_index(inplace=True)
categorical.drop("CustID", axis=1, inplace=True)

# FIX: recode 0/1 as 'No'/'Yes' for Children only. The previous frame-wide
# categorical.replace({0:'No', 1:'Yes'}) would also rewrite any 0/1 codes in
# GeoLivArea (its area codes plausibly include 1 — TODO confirm against the
# data dictionary), silently corrupting that category.
response = {0: 'No', 1: 'Yes'}
categorical['Children'] = categorical['Children'].replace(response)
# One-hot encode the SOM+HC cluster labels: one indicator column per cluster
ohe = OneHotEncoder()
encoded = ohe.fit_transform(som_hc_final["labels"].values.reshape(-1, 1)).toarray()
cluster_labels = pd.DataFrame(
    encoded,
    index=som_hc_final.index,
    columns=["cluster_0", "cluster_1", "cluster_2"],
)
cluster_labels
| cluster_0 | cluster_1 | cluster_2 | |
|---|---|---|---|
| BMU | |||
| 0.0 | 1.0 | 0.0 | 0.0 |
| 1.0 | 1.0 | 0.0 | 0.0 |
| 2.0 | 1.0 | 0.0 | 0.0 |
| 3.0 | 0.0 | 1.0 | 0.0 |
| 4.0 | 0.0 | 1.0 | 0.0 |
| ... | ... | ... | ... |
| 94.0 | 1.0 | 0.0 | 0.0 |
| 95.0 | 0.0 | 1.0 | 0.0 |
| 97.0 | 0.0 | 1.0 | 0.0 |
| 98.0 | 0.0 | 1.0 | 0.0 |
| 99.0 | 0.0 | 1.0 | 0.0 |
87 rows × 3 columns
# Recode the 0/1 cluster indicators as 'No'/'Yes' so MCA treats them as
# categories, then join them onto the categorical frame by index
string_labels = {0: 'No', 1: 'Yes'}
cluster_labels = cluster_labels.replace(string_labels)
mca_df = categorical.merge(cluster_labels, left_index=True, right_index=True)
# Fit a 3-component Multiple Correspondence Analysis on the categorical data
mca = prince.MCA(
    n_components=3,
    n_iter=3,
    copy=True,
    check_input=False,
    engine='auto',
    random_state=42,
).fit(mca_df)

# Plot only the column (category) coordinates; row points are suppressed
mca.plot_coordinates(
    X=mca_df,
    figsize=(15, 9),
    show_row_points=False,
    row_points_size=0,
    show_row_labels=False,
    show_column_points=True,
    column_points_size=50,
    show_column_labels=True,
)
<AxesSubplot:title={'center':'Row and column principal coordinates'}, xlabel='Component 0 (18.90% inertia)', ylabel='Component 1 (15.47% inertia)'>